1. Preliminaries: Load Libraries and Log into Twitter

library(rtweet)
library(httpuv)
library(tidyverse)
library(tidytext)
library(wordcloud2)
library(qdapRegex)
library(tm)
library(webshot)
library(htmlwidgets)
library(ggplot2)

2. Pull Data from the Twitter API via rtweet Package

# Pull the 1000 most recent tweets carrying #CancelStudentDebt (timestamps are GMT).
# The backtick-quoted `-filter` argument injects "-filter:replies" into the query,
# excluding replies from the results.
student_debt_tweets <- search_tweets(
  q = "#CancelStudentDebt",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets that contain both #CancelStudentDebt AND the word "capitalism"
# (terms separated by a space are AND-ed by the Twitter search API)
student_debt_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Instead of pulling from the API a second time, you could also pull tweets with #CancelStudentDebt once, and then query the text
# of these tweets locally using a stringr function

# Local alternative to a second API call: keep only the rows of the existing
# dataset whose text mentions "capitalism" or "Capitalism"
student_debt_capitalism_tweets_ALT <- student_debt_tweets %>%
  filter(str_detect(text, "[Cc]apitalism"))
# Pull tweets matching #CancelStudentDebt OR the word "capitalism"
student_debt_OR_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt OR capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets from a single account's timeline (not subject to the search
# endpoint's recency window). Request the last 3200 BLM tweets; the API can
# return fewer than 3200 because deleted tweets are excluded.
blm_tweets <- get_timeline("@Blklivesmatter", n = 3200)

3. Clean, Organize, and Query Twitter Datasets

3.1. Query blm_tweets to find the 10 tweets with the most favorites

blm_tweets_most_favorited<-blm_tweets %>% slice_max(favorite_count, n=10)

3.2. Remove unnecessary columns from “blm_tweets_most_favorited”

# Trim to the columns needed for display, then print the result
blm_tweets_most_favorited <- blm_tweets_most_favorited %>%
  select(created_at, screen_name, text, favorite_count)
blm_tweets_most_favorited
## # A tibble: 10 x 4
##    created_at          screen_name  text                          favorite_count
##    <dttm>              <chr>        <chr>                                  <dbl>
##  1 2020-08-29 03:31:41 Blklivesmat… "Beyond painful. Rest in Pow…          53597
##  2 2021-01-06 19:42:10 Blklivesmat… "So we all just gonna act li…          46103
##  3 2020-06-15 15:34:41 Blklivesmat… "Until today, you could be f…          44755
##  4 2020-05-31 03:21:51 Blklivesmat… "We call for an END to syste…          33327
##  5 2020-05-29 21:39:43 Blklivesmat… "Rest in Power, Beautiful. @…          32572
##  6 2020-05-26 17:38:48 Blklivesmat… "His name was George Floyd. …          31084
##  7 2020-06-09 21:24:28 Blklivesmat… "You have changed us forever…          29776
##  8 2020-06-05 14:04:00 Blklivesmat… "Happy 27th birthday, Breonn…          26147
##  9 2020-10-11 00:22:13 Blklivesmat… "#BlackLivesMatter rises wit…          23265
## 10 2020-06-03 11:59:10 Blklivesmat… "When people take to the str…          22675

3.3 Query blm_tweets to find the 10 tweets with the most retweets and then select desired columns in one block of code

# Top 10 most-retweeted tweets, trimmed to display columns, in one pipeline
blm_tweets_most_retweeted <-
  blm_tweets %>%
  slice_max(retweet_count, n = 10) %>%
  select(created_at, screen_name, text, retweet_count)
blm_tweets_most_retweeted
## # A tibble: 10 x 4
##    created_at          screen_name   text                          retweet_count
##    <dttm>              <chr>         <chr>                                 <dbl>
##  1 2020-08-27 02:50:05 Blklivesmatt… "FUCK THIS MAN!!!! WE DEMAND…        264125
##  2 2020-10-11 01:40:00 Blklivesmatt… "A thread on what’s happenin…         51097
##  3 2020-05-03 17:42:05 Blklivesmatt… "*Blinks in BLM*  https://t.…         48906
##  4 2021-01-07 12:40:38 Blklivesmatt… "They've killed us for less!"         43303
##  5 2020-06-09 00:10:55 Blklivesmatt… "3 million students attend s…         41545
##  6 2020-07-18 16:50:58 Blklivesmatt… "55 years ago today, we were…         40229
##  7 2020-12-24 00:46:49 Blklivesmatt… "Move, Mitch, get out the wa…         39516
##  8 2020-05-03 17:42:58 Blklivesmatt… "Think about how harshly #Bl…         39207
##  9 2020-06-14 16:39:14 Blklivesmatt… "A heartbreaker. \n\nNext we…         28458
## 10 2021-01-18 18:38:11 Blklivesmatt… "A thread of Dr. King in col…         28395

3.4 Remove retweets from blm_tweets

blm_tweets_noretweets<-blm_tweets %>% filter(is_retweet=="FALSE")

3.6 Query the data to find the 5 handles that have most frequently used #CancelStudentDebt

# Count tweets per handle, then keep the 5 most frequent tweeters
# (ties are kept by slice_max, so more than 5 rows are possible)
student_debt_tweets_frequentweeters <- student_debt_tweets %>%
  group_by(screen_name) %>%
  summarise(n = n()) %>%
  slice_max(n, n = 5)

3.7 Query the data to find the 10 hashtags appearing most frequently in conjunction with #CancelStudentDebt

# Hashtags most often used alongside #CancelStudentDebt: flatten the hashtags
# list-column, drop every case variant of "cancelstudentdebt" itself, and count
# what remains (counting is case-sensitive, preserving original capitalization)
CancelStudentDebt_coinciding_hashtags <- student_debt_tweets %>%
  select(hashtags) %>%
  unnest(hashtags) %>%
  filter(str_to_lower(hashtags) != "cancelstudentdebt") %>%
  count(hashtags) %>%
  slice_max(n, n = 10)

4. Visualize Data

4.1 Using ggplot to make visualizations of twitter data: bar graph of coincident hashtags

# Prefix each tag with "#" so axis labels read as hashtags
CancelStudentDebt_coinciding_hashtags <- CancelStudentDebt_coinciding_hashtags %>%
  mutate(hashtag = str_c("#", hashtags))

# Horizontal bar chart of coincident hashtags, ordered by frequency.
# geom_col() is the documented shorthand for geom_bar(stat = "identity"),
# and labs() consolidates the separate xlab/ylab/ggtitle calls.
coincident_hashtags_plot <-
  ggplot(CancelStudentDebt_coinciding_hashtags, aes(x = reorder(hashtag, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Frequency",
    title = "Hashtags Most Frequently Used Along With #CancelStudentDebt",
    caption = "Data Collected from Twitter REST API via rtweet"
  )
coincident_hashtags_plot

4.2. Using rtweet’s visualization functions: time series example

# rtweet's built-in time-series helper: tweet frequency binned by hour,
# with a subtitle spanning the first and last collection timestamps
ts_plot(student_debt_tweets, "hours") +
  labs(
    x = NULL,
    y = NULL,
    title = "Frequency of tweets with a #CancelStudentDebt hashtag",
    subtitle = paste0(
      format(min(student_debt_tweets$created_at), "%d %B %Y"),
      " to ",
      format(max(student_debt_tweets$created_at), "%d %B %Y")
    ),
    caption = "Data collected from Twitter's REST API via rtweet"
  ) +
  theme_minimal()

4.3. Word Cloud

blm_text<-str_c(blm_tweets$text, collapse="")


blm_text <-
  blm_text %>%
  str_remove_all("\\n") %>%                 # remove ALL linebreaks (str_remove drops only the first match)
  rm_twitter_url() %>%                      # remove Twitter (t.co) URLs
  rm_url() %>%                              # remove any remaining URLs
  str_remove_all("#\\S+") %>%               # remove hashtags
  str_remove_all("@\\S+") %>%               # remove @ mentions
  removeWords(stopwords("english")) %>%     # remove common words (a, the, it, etc.)
  removeNumbers() %>%                       # strip digits
  stripWhitespace() %>%                     # collapse runs of whitespace
  removeWords(c("amp"))                     # drop "amp" left over from &amp; HTML entities


# Build a term-document matrix from the cleaned text and convert it to a
# plain matrix of per-term counts
textCorpus <-
  blm_text %>%
  VectorSource() %>%
  Corpus() %>%
  TermDocumentMatrix() %>%
  as.matrix()

# Sum counts per term, order from most to least frequent, and reshape into the
# two-column (word, freq) data frame that wordcloud2 expects
term_totals <- sort(rowSums(textCorpus), decreasing = TRUE)
textCorpus <- data.frame(word = names(term_totals), freq = term_totals, row.names = NULL)
# "The" (capitalized) survives the case-sensitive stopword removal above and is
# lowercased by the term-document matrix, so drop "the" explicitly here
textCorpus <- filter(textCorpus, word != "the")

wordcloud_blm <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
wordcloud_blm

You can write out your word cloud to disk with the following:

# One-time download of PhantomJS, which webshot uses for headless rendering
install_phantomjs()
# Save the interactive widget as HTML, then screenshot it to PNG.
# Spell out FALSE rather than the reassignable shorthand F.
saveWidget(wordcloud_blm, "blm.html", selfcontained = FALSE)
webshot("blm.html", "blm.png", vwidth = 1000, vheight = 1000, delay = 10)

5. Writing Functions and Automating Your Twitter Analysis

5a. Wrap the code to create a word cloud into a function

# Build a word cloud from a Twitter account's recent timeline.
#
# Args:
#   twitterhandle: screen name of the account (e.g. "nytimes").
#   tweet_number:  number of recent tweets to request (the API may return fewer).
#
# Returns: a wordcloud2 htmlwidget.
twitter_wordcloud <- function(twitterhandle, tweet_number) {
  tweet_timeline <- get_timeline(twitterhandle, n = tweet_number)

  # Collapse with a space so words at tweet boundaries are not glued together
  tweet_timeline_text <- str_c(tweet_timeline$text, collapse = " ")

  tweet_timeline_text <- tweet_timeline_text %>%
    str_remove_all("\\n") %>%               # remove ALL linebreaks (str_remove drops only the first)
    rm_twitter_url() %>%                    # remove Twitter (t.co) URLs
    rm_url() %>%                            # remove any remaining URLs
    str_remove_all("#\\S+") %>%             # remove hashtags
    str_remove_all("@\\S+") %>%             # remove @ mentions
    removeWords(stopwords("english")) %>%   # remove common words (a, the, it, etc.)
    removeNumbers() %>%                     # strip digits
    stripWhitespace() %>%                   # collapse runs of whitespace
    removeWords(c("amp")) %>%               # drop "amp" left over from &amp; entities
    removePunctuation() %>%
    # Remove only the standalone word "the"/"The": the previous unanchored
    # pattern '[Tt]he' also mangled words containing it (e.g. "other" -> "or")
    str_remove_all(pattern = "\\b[Tt]he\\b") %>%
    str_remove_all(pattern = "[:emoji:]")   # strip emoji (ICU property class)

  # Term-document matrix -> plain count matrix
  textCorpus <-
    Corpus(VectorSource(tweet_timeline_text)) %>%
    TermDocumentMatrix() %>%
    as.matrix()

  # Per-term totals, sorted, reshaped into the (word, freq) frame wordcloud2 expects
  textCorpus <- sort(rowSums(textCorpus), decreasing = TRUE)
  textCorpus <- data.frame(word = names(textCorpus), freq = textCorpus, row.names = NULL)

  wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
}

5b. Test the function

# Generate a word cloud for the 400 most recent NYT tweets and assign it
nyt_wordcloud <- twitter_wordcloud(twitterhandle = "nytimes", tweet_number = 400)
# View the NYT word cloud
nyt_wordcloud

5c. Iteratively Apply the Function to Multiple Twitter Handles

Apply the “twitter_wordcloud” function created above to multiple handles, and generate multiple word clouds based on those handles. We’ll apply the function to the Twitter handles of the New York Times, Financial Times, Washington Post, Fox News, CNN, and the Denver Post.

handles <- c("nytimes", "FinancialTimes", "FoxNews", "cnn", "washingtonpost", "denverpost")
number <- c(400)  # length-1 .y is recycled across all handles by map2()
# Name the list elements by handle: map2() returns an UNNAMED list, so without
# set_names() the later wordcloud_list[["washingtonpost"]] lookups return NULL.
wordcloud_list <- map2(.x = handles, .y = number, .f = twitter_wordcloud) %>%
  set_names(handles)

View the Washington Post word cloud by accessing it from the list:

# View Washington Post Word Cloud
# NOTE(review): name-based [[ ]] lookup requires wordcloud_list to carry
# element names; confirm the list was built with names (an unnamed list
# returns NULL here).
wordcloud_list[["washingtonpost"]]

View the Denver Post word cloud by accessing from the list:

# View Denver Post word cloud by extracting it from the list
wordcloud_list[["denverpost"]]
# Reference the image by its full path instead of calling setwd(): changing
# the working directory is a session-wide side effect that can break any
# later code relying on relative paths.
knitr::include_graphics("~/Documents/git_repositories/twitter_workshop/images/denverpost.png")

What would you type if you want to extract the Financial Times word cloud from the list?

5d. Iteratively Write Out All of the Media Word Clouds to Disk

# Write a function that takes a word cloud and its name, and writes the word cloud out to disk
# Save one word cloud widget to disk as HTML, then screenshot it to PNG.
#
# Args:
#   wordclouds_to_export: a wordcloud2 htmlwidget.
#   wordcloud_names:      base filename (no extension) for the output files.
output_wordclouds <- function(wordclouds_to_export, wordcloud_names) {
  # Restore the caller's working directory on exit instead of leaking the
  # setwd() side effect out of the function
  old_wd <- setwd("/Users/adra7980/Documents/git_repositories/twitter_workshop/wordclouds")
  on.exit(setwd(old_wd), add = TRUE)
  # Only download PhantomJS if it is not already installed (the original
  # re-ran the installer on every call)
  if (!is_phantomjs_installed()) {
    install_phantomjs()
  }
  saveWidget(wordclouds_to_export, paste0(wordcloud_names, ".html"), selfcontained = FALSE)
  webshot(paste0(wordcloud_names, ".html"), paste0(wordcloud_names, ".png"), vwidth = 1992, vheight = 1744, delay = 10)
}
# Iteratively write every word cloud in the list to disk. walk2() is the purrr
# idiom for side-effect iteration (it returns its input invisibly instead of a
# list of results). Pair each widget with its handle from `handles`:
# names(wordcloud_list) is NULL when the list comes straight from map2(),
# which would make the name-based pairing fail.
walk2(.x = wordcloud_list, .y = handles, .f = output_wordclouds)